{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "17c1ceb7-3ed7-49c5-8d86-75f05d5b9e46",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments\n",
    "import evaluate\n",
    "import numpy as np\n",
    "from datasets import load_metric, load_dataset\n",
    "import random\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e589b8a4-c8b5-464a-aad0-8c8e13054e48",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Defining a constant SEED for reproducibility in random operations\n",
    "SEED = 42\n",
    "\n",
    "# Setting the seed for the random library to ensure consistent results\n",
    "random.seed(SEED)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "7d2dc0c4-8932-4889-975d-05160be113ce",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found cached dataset app_reviews (/Users/sinanozdemir/.cache/huggingface/datasets/app_reviews/default/0.0.0/20335b51b604b9bc04b7be253cd8445caa9ba93f15f39a4b0492b9e9102853de)\n",
      "Loading cached processed dataset at /Users/sinanozdemir/.cache/huggingface/datasets/app_reviews/default/0.0.0/20335b51b604b9bc04b7be253cd8445caa9ba93f15f39a4b0492b9e9102853de/cache-2ed8b4950d3481dc.arrow\n",
      "Loading cached processed dataset at /Users/sinanozdemir/.cache/huggingface/datasets/app_reviews/default/0.0.0/20335b51b604b9bc04b7be253cd8445caa9ba93f15f39a4b0492b9e9102853de/cache-8340e0b17b85391f.arrow\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['package_name', 'review', 'date', 'star'],\n",
       "    num_rows: 288065\n",
       "})"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 'star' is a column in our dataset and we want to convert it to a ClassLabel column\n",
    "# so we can stratify our samples.\n",
    "\n",
    "# Importing the ClassLabel module to represent categorical class labels\n",
    "from datasets import ClassLabel\n",
    "\n",
    "# Loading the 'app_reviews' dataset's training split into the 'dataset' variable\n",
    "dataset = load_dataset('app_reviews', split='train')\n",
    "\n",
    "# Converting the 'star' column in our dataset to a ClassLabel type\n",
    "# This allows for categorical representation and easier handling of classes\n",
    "dataset = dataset.class_encode_column('star')\n",
    "\n",
    "# Displaying the dataset to see the changes\n",
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ffe447c4-fc0e-4727-8538-aee1d4d2fa53",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading cached split indices for dataset at /Users/sinanozdemir/.cache/huggingface/datasets/app_reviews/default/0.0.0/20335b51b604b9bc04b7be253cd8445caa9ba93f15f39a4b0492b9e9102853de/cache-7b0bb7f9411c7832.arrow and /Users/sinanozdemir/.cache/huggingface/datasets/app_reviews/default/0.0.0/20335b51b604b9bc04b7be253cd8445caa9ba93f15f39a4b0492b9e9102853de/cache-8e6b458ed2fd8d99.arrow\n",
      "Loading cached split indices for dataset at /Users/sinanozdemir/.cache/huggingface/datasets/app_reviews/default/0.0.0/20335b51b604b9bc04b7be253cd8445caa9ba93f15f39a4b0492b9e9102853de/cache-aad9818ddedac634.arrow and /Users/sinanozdemir/.cache/huggingface/datasets/app_reviews/default/0.0.0/20335b51b604b9bc04b7be253cd8445caa9ba93f15f39a4b0492b9e9102853de/cache-35c54ad93c1d9924.arrow\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['package_name', 'review', 'date', 'star'],\n",
       "        num_rows: 172839\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['package_name', 'review', 'date', 'star'],\n",
       "        num_rows: 57613\n",
       "    })\n",
       "    val: Dataset({\n",
       "        features: ['package_name', 'review', 'date', 'star'],\n",
       "        num_rows: 57613\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Splitting the dataset into a training set and a test set.\n",
    "# We reserve 20% of the data for testing and use stratification on the 'star' column\n",
    "# to ensure both sets have an equal distribution of each star category.\n",
    "dataset = dataset.train_test_split(test_size=0.2, seed=SEED, stratify_by_column='star')\n",
    "\n",
    "# Now, we further split our training dataset to reserve 25% of it for validation.\n",
    "# Again, we stratify by the 'star' column to keep the distribution consistent.\n",
    "df = dataset['train'].train_test_split(test_size=.25, seed=SEED, stratify_by_column='star')\n",
    "\n",
    "# Assigning the split datasets to their respective keys:\n",
    "# - The remaining 75% of our initial training data becomes the new training dataset.\n",
    "dataset['train'] = df['train']\n",
    "\n",
    "# - The 25% split from our initial training data becomes the validation dataset.\n",
    "dataset['val'] = df['test']\n",
    "\n",
    "# Displaying the dataset to see the distribution across train, test, and validation sets.\n",
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f25c4761-bc78-408c-aa4d-21119b9779b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "MODEL = 'distilbert-base-cased'\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(MODEL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "1c17e580-5916-450b-98da-c7c9b7474646",
   "metadata": {},
   "outputs": [],
   "source": [
    "# simple function to batch tokenize utterances with truncation\n",
    "def preprocess_function(examples):  # each example is an element from the Dataset\n",
    "    return tokenizer(examples[\"review\"], truncation=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "8f17cad6-e4e9-4b10-ab03-7dd2e5e7d669",
   "metadata": {},
   "outputs": [],
   "source": [
    "# DataCollatorWithPadding creates batch of data. It also dynamically pads text to the \n",
    "#  length of the longest element in the batch, making them all the same length. \n",
    "#  It's possible to pad your text in the tokenizer function with padding=True, dynamic padding is more efficient.\n",
    "\n",
    "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "626208d0-1e66-4559-a80b-f2888cc1a1cf",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "2728591b-2795-4bb3-b1a6-d83a31a51c47",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "sequence_clf_model = AutoModelForSequenceClassification.from_pretrained(\n",
    "    MODEL,\n",
    "    num_labels=5,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "a44cd57d-d439-4432-b899-995dbc35d712",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DistilBertForSequenceClassification(\n",
       "  (distilbert): DistilBertModel(\n",
       "    (embeddings): Embeddings(\n",
       "      (word_embeddings): Embedding(28996, 768, padding_idx=0)\n",
       "      (position_embeddings): Embedding(512, 768)\n",
       "      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "      (dropout): Dropout(p=0.1, inplace=False)\n",
       "    )\n",
       "    (transformer): Transformer(\n",
       "      (layer): ModuleList(\n",
       "        (0-5): 6 x TransformerBlock(\n",
       "          (attention): MultiHeadSelfAttention(\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "            (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "          )\n",
       "          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "          (ffn): FFN(\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "            (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
       "            (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
       "            (activation): GELUActivation()\n",
       "          )\n",
       "          (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "        )\n",
       "      )\n",
       "    )\n",
       "  )\n",
       "  (pre_classifier): Linear(in_features=768, out_features=768, bias=True)\n",
       "  (classifier): Linear(in_features=768, out_features=5, bias=True)\n",
       "  (dropout): Dropout(p=0.2, inplace=False)\n",
       ")"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sequence_clf_model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "92a44187-d471-40f7-a47c-1df5e3083440",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading cached processed dataset at /Users/sinanozdemir/.cache/huggingface/datasets/app_reviews/default/0.0.0/20335b51b604b9bc04b7be253cd8445caa9ba93f15f39a4b0492b9e9102853de/cache-1d507ec62ce5ddd6.arrow\n",
      "Loading cached processed dataset at /Users/sinanozdemir/.cache/huggingface/datasets/app_reviews/default/0.0.0/20335b51b604b9bc04b7be253cd8445caa9ba93f15f39a4b0492b9e9102853de/cache-423ab1e7a2c17013.arrow\n",
      "Loading cached processed dataset at /Users/sinanozdemir/.cache/huggingface/datasets/app_reviews/default/0.0.0/20335b51b604b9bc04b7be253cd8445caa9ba93f15f39a4b0492b9e9102853de/cache-899b4eef64bd6fb3.arrow\n"
     ]
    }
   ],
   "source": [
    "dataset = dataset.map(preprocess_function, batched=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "1b42374d-8e48-4b0e-88a7-87b801067855",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['label', 'input_ids', 'attention_mask'],\n",
       "        num_rows: 172839\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['label', 'input_ids', 'attention_mask'],\n",
       "        num_rows: 57613\n",
       "    })\n",
       "    val: Dataset({\n",
       "        features: ['label', 'input_ids', 'attention_mask'],\n",
       "        num_rows: 57613\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = dataset.rename_column(\"star\", \"label\")\n",
    "dataset = dataset.remove_columns(['package_name', 'review', 'date'])\n",
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5824878-f25d-450b-8afd-81a3a0879abc",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "input_ids = dataset['train']['input_ids']\n",
    "pd.Series(input_ids).apply(len).hist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "d278028c-193e-4cd6-b443-46eabfbe11d2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['label', 'input_ids', 'attention_mask'],\n",
       "        num_rows: 172839\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['label', 'input_ids', 'attention_mask'],\n",
       "        num_rows: 57613\n",
       "    })\n",
       "    val: Dataset({\n",
       "        features: ['label', 'input_ids', 'attention_mask'],\n",
       "        num_rows: 57613\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "f4136b95-d3e1-4b1a-861c-e92b6f44a423",
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_metrics(p):\n",
    "    preds = np.argmax(p.predictions, axis=1)\n",
    "    return {\"accuracy\": (preds == p.label_ids).mean()}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "f59d37f9-d732-4543-90c0-315bedf9415b",
   "metadata": {},
   "outputs": [],
   "source": [
    "epochs = 2\n",
    "\n",
    "training_args = TrainingArguments(\n",
    "    output_dir=\"./bert_clf_results\",\n",
    "    num_train_epochs=epochs,\n",
    "    per_device_train_batch_size=16,\n",
    "    gradient_accumulation_steps=2,\n",
    "    per_device_eval_batch_size=32,\n",
    "    load_best_model_at_end=True,\n",
    "    \n",
    "    # some deep learning parameters that the Trainer is able to take in\n",
    "    warmup_ratio=0.1,\n",
    "    weight_decay = 0.05,\n",
    "    \n",
    "    logging_steps=1,\n",
    "    log_level='info',\n",
    "    evaluation_strategy='epoch',\n",
    "    eval_steps=50,\n",
    "    save_strategy='epoch'\n",
    ")\n",
    "\n",
    "# Define the trainer:\n",
    "\n",
    "trainer = Trainer(\n",
    "    model=sequence_clf_model,\n",
    "    args=training_args,\n",
    "    train_dataset=dataset['train'],\n",
    "    eval_dataset=dataset['val'],\n",
    "    compute_metrics=compute_metrics,  # optional\n",
    "    data_collator=data_collator  # technically optional\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "5bfdefac-a0d4-435f-be2f-1287547b4397",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "***** Running Evaluation *****\n",
      "  Num examples = 57613\n",
      "  Batch size = 32\n",
      "You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='3513' max='1801' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [1801/1801 1:10:47]\n",
       "    </div>\n",
       "    "
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Automatic Weights & Biases logging enabled, to disable set os.environ[\"WANDB_DISABLED\"] = \"true\"\n",
      "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mprofoz\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "wandb version 0.16.0 is available!  To upgrade, please run:\n",
       " $ pip install wandb --upgrade"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Tracking run with wandb version 0.15.7"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Run data is saved locally in <code>/Users/sinanozdemir/Teaching/Pearson/oreilly-hands-on-transformers/notebooks/wandb/run-20231119_082851-b6ujyjgx</code>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Syncing run <strong><a href='https://wandb.ai/profoz/huggingface/runs/b6ujyjgx' target=\"_blank\">dulcet-energy-198</a></strong> to <a href='https://wandb.ai/profoz/huggingface' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       " View project at <a href='https://wandb.ai/profoz/huggingface' target=\"_blank\">https://wandb.ai/profoz/huggingface</a>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       " View run at <a href='https://wandb.ai/profoz/huggingface/runs/b6ujyjgx' target=\"_blank\">https://wandb.ai/profoz/huggingface/runs/b6ujyjgx</a>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "{'eval_loss': 1.554797887802124,\n",
       " 'eval_accuracy': 0.5844340686997727,\n",
       " 'eval_runtime': 376.4524,\n",
       " 'eval_samples_per_second': 153.042,\n",
       " 'eval_steps_per_second': 4.784}"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.evaluate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "603b86ef-3da5-4c58-a593-7c8386229d8a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "***** Running training *****\n",
      "  Num examples = 172,839\n",
      "  Num Epochs = 2\n",
      "  Instantaneous batch size per device = 16\n",
      "  Total train batch size (w. parallel, distributed & accumulation) = 32\n",
      "  Gradient Accumulation steps = 2\n",
      "  Total optimization steps = 10,802\n",
      "  Number of trainable parameters = 65,785,349\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='10802' max='10802' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [10802/10802 2:01:54, Epoch 1/2]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Epoch</th>\n",
       "      <th>Training Loss</th>\n",
       "      <th>Validation Loss</th>\n",
       "      <th>Accuracy</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0.565400</td>\n",
       "      <td>0.827652</td>\n",
       "      <td>0.712738</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.856400</td>\n",
       "      <td>0.820736</td>\n",
       "      <td>0.717685</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "***** Running Evaluation *****\n",
      "  Num examples = 57613\n",
      "  Batch size = 32\n",
      "Saving model checkpoint to ./bert_clf_results/checkpoint-10802\n",
      "Configuration saved in ./bert_clf_results/checkpoint-10802/config.json\n",
      "Model weights saved in ./bert_clf_results/checkpoint-10802/pytorch_model.bin\n",
      "\n",
      "\n",
      "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
      "\n",
      "\n",
      "Loading best model from ./bert_clf_results/checkpoint-10802 (score: 0.8207359910011292).\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "TrainOutput(global_step=10802, training_loss=0.8283754970834909, metrics={'train_runtime': 7323.4982, 'train_samples_per_second': 47.201, 'train_steps_per_second': 1.475, 'total_flos': 7015951744063140.0, 'train_loss': 0.8283754970834909, 'epoch': 2.0})"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e117977-822c-460a-8b91-462eb6d4a340",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c55b7b6e-64f9-4da3-802f-f480c9045ac4",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
